# import packages
import pandas as pd
import matplotlib.pyplot as plt

# read in op 2014 data
keep_cols = ['taxpayer_id_new', 'isEIC', 'aud_no_research_audits']
dataBISG = pd.read_csv("/REDACTED/data/final/individualBISG2014_full_final.csv", usecols = keep_cols)

## keep op EITC
dataBISG = dataBISG[dataBISG.isEIC == 1]

# read in refundable credit overclaiming predictions
keep_cols = ['taxpayer_id_new', 'sort_var_avg']
preds_f = pd.read_csv('/REDACTED/eitc_predictions_all.csv', usecols = keep_cols)
# we have more predictions than we do members of eitc in op 2014

# merge predictions onto op eitc 2014 population
dataBISG_m = dataBISG.merge(preds_f, how = 'left', on = 'taxpayer_id_new')
dataBISG = dataBISG_m

# divide into percentiles of predicted refundable credit overclaiming
dataBISG['bin'] = pd.qcut(dataBISG['sort_var_avg'], 100, labels=False)

dataBISG = dataBISG[dataBISG['sort_var_avg'].notnull()] # removes one taxpayer

list_of_bins = dataBISG.bin.unique().tolist()
list_of_bins.sort()

# loop over predicted non-compliance bins and calculate audit rates
bin_means = []
aud_rates = []
for bin in list_of_bins:
    print(bin)
    temp = dataBISG[dataBISG['bin'] == bin]
    print(len(temp))
    bin_mean = (temp['sort_var_avg'].min() + temp['sort_var_avg'].max()) / 2
    aud_rate = len(temp[temp['aud_no_research_audits'] == 1]) / len(temp)
    bin_means.append(bin_mean)
    aud_rates.append(aud_rate)

# zip results into a dataframe
prob_df = pd.DataFrame(list(zip(bin_means, aud_rates, list_of_bins)), columns = ["bin_mean", "aud_rate", "bin"])

# read in total underreportaxpayer_idg predictions
keep_cols = ['taxpayer_id_new', 'sort_var_avg']
preds_f = pd.read_csv('/REDACTED/eitc_predictions_all_total_underreportaxpayer_idg.csv', usecols = keep_cols)
preds_f = preds_f.rename(columns={'sort_var_avg':'sort_var_avg_total_underreportaxpayer_idg'})

# merge onto op 2014 
dataBISG_m = dataBISG.merge(preds_f, how = 'left', on = 'taxpayer_id_new')
dataBISG = dataBISG_m
# we have more predictions than we do members of eitc in op 2014

# create 100 bins of predicted underreportaxpayer_idg
dataBISG['bin_total_underreportaxpayer_idg'] = pd.qcut(dataBISG['sort_var_avg_total_underreportaxpayer_idg'], 100, labels=False)

dataBISG = dataBISG[dataBISG['sort_var_avg_total_underreportaxpayer_idg'].notnull()] # removes nobody

list_of_bins = dataBISG.bin_total_underreportaxpayer_idg.unique().tolist()
list_of_bins.sort()

# loop over predicted non-compliance bins and calculate audit rates
bin_means = []
aud_rates = []
for bin in list_of_bins:
    print(bin)
    temp = dataBISG[dataBISG['bin_total_underreportaxpayer_idg'] == bin]
    print(len(temp))
    print(temp.sort_var_avg_total_underreportaxpayer_idg.min())
    print(temp.sort_var_avg_total_underreportaxpayer_idg.max())
    bin_mean = (temp['sort_var_avg_total_underreportaxpayer_idg'].min() + temp['sort_var_avg_total_underreportaxpayer_idg'].max()) / 2
    aud_rate = len(temp[temp['aud_no_research_audits'] == 1]) / len(temp)
    bin_means.append(bin_mean)
    aud_rates.append(aud_rate)

# zip results into a dataframe
prob_df_total_underreportaxpayer_idg = pd.DataFrame(list(zip(bin_means, aud_rates, list_of_bins)), columns = ["bin_mean", "aud_rate", "bin"])  

# plot audit rates by prediction percentiles for both sets of predictions
ax1 = plt.subplot() 
ax1.plot(prob_df_total_underreportaxpayer_idg.bin, prob_df_total_underreportaxpayer_idg.aud_rate, linestyle='dashed', color = 'black', alpha = 0.7, markersize = 2, label='Total Underreportaxpayer_idg Predictions')
ax1.plot(prob_df.bin, prob_df.aud_rate, color = 'black', alpha = 0.7, markersize = 2, label='Refundable Credit Overclaiming Predictions')
legend = ax1.legend()
plt.xlabel('Prediction Percentile')
plt.ylabel('Audit Rate')
plt.savefig("/REDACTED/audit_rates_by_prediction_percentiles.png") # top panel of figure a.19, works!
plt.close()


#########################
### Top 5 Percentiles ###
#########################

# divide into 400 bins of predicted refundable credit overclaiming
dataBISG['bin'] = pd.qcut(dataBISG['sort_var_avg'], 400, labels=False)

list_of_bins = dataBISG.bin.unique().tolist()
list_of_bins.sort()

# subset to top 20 bins, corresponding to top 5 percentiles
list_of_bins = [x for x in list_of_bins if x >= 380]

# loop over predicted non-compliance bins and calculate audit rates
bin_means = []
aud_rates = []
for bin in list_of_bins:
    print(bin)
    temp = dataBISG[dataBISG['bin'] == bin]
    print(len(temp))
    bin_mean = (temp['sort_var_avg'].min() + temp['sort_var_avg'].max()) / 2
    aud_rate = len(temp[temp['aud_no_research_audits'] == 1]) / len(temp)
    bin_means.append(bin_mean)
    aud_rates.append(aud_rate)

# zip results into a dataframe
prob_df = pd.DataFrame(list(zip(bin_means, aud_rates, list_of_bins)), columns = ["bin_mean", "aud_rate", "bin"]) 

# divide into 400 bins of predicted total underreportaxpayer_idg
dataBISG['bin_total_underreportaxpayer_idg'] = pd.qcut(dataBISG['sort_var_avg_total_underreportaxpayer_idg'], 400, labels=False)

list_of_bins = dataBISG.bin_total_underreportaxpayer_idg.unique().tolist()
list_of_bins.sort()

# subset to top 20 bins, corresponding to top 5 percentiles
list_of_bins = [x for x in list_of_bins if x >= 380] 

# loop over predicted non-compliance bins and calculate audit rates
bin_means_total_underreportaxpayer_idg = []
aud_rates_total_underreportaxpayer_idg = []
for bin in list_of_bins:
    print(bin)
    temp = dataBISG[dataBISG['bin_total_underreportaxpayer_idg'] == bin]
    print(len(temp))
    print(temp.sort_var_avg_total_underreportaxpayer_idg.min())
    print(temp.sort_var_avg_total_underreportaxpayer_idg.max())
    bin_mean = (temp['sort_var_avg_total_underreportaxpayer_idg'].min() + temp['sort_var_avg_total_underreportaxpayer_idg'].max()) / 2
    aud_rate = len(temp[temp['aud_no_research_audits'] == 1]) / len(temp)
    bin_means_total_underreportaxpayer_idg.append(bin_mean)
    aud_rates_total_underreportaxpayer_idg.append(aud_rate)

# zip results into a dataframe
prob_df_total_underreportaxpayer_idg = pd.DataFrame(list(zip(bin_means_total_underreportaxpayer_idg, aud_rates_total_underreportaxpayer_idg, list_of_bins)), columns = ["bin_mean", "aud_rate", "bin"])

# plot audit rates by top 20 prediction bins for both sets of predictions
ax1 = plt.subplot() 
ax1.plot(prob_df_total_underreportaxpayer_idg.bin/4, prob_df_total_underreportaxpayer_idg.aud_rate, linestyle='dashed', color = 'black', alpha = 0.7, markersize = 2, label='Total Underreportaxpayer_idg Predictions')
ax1.plot(prob_df.bin/4, prob_df.aud_rate, color = 'black', alpha = 0.7, markersize = 2, label='Refundable Credit Overclaiming Predictions')
legend = ax1.legend()
plt.xlabel('Prediction Percentile')
plt.ylabel('Audit Rate')
plt.savefig("/REDACTED/audit_rates_by_prediction_percentiles_top5.png") # bottom panel of fig a.19, also works!
plt.close() 